In [1]:
pip install pandas
Requirement already satisfied: pandas in c:\users\shrey\anaconda3\lib\site-packages (1.4.2)
Requirement already satisfied: numpy>=1.18.5 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (1.21.5)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas) (2021.3)
Requirement already satisfied: six>=1.5 in c:\users\shrey\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas as pd
In [3]:
#load the dataset

df = pd.read_csv("C:/Users/shrey/OneDrive/Desktop/project/DataScience_salaries_2024.csv")
In [4]:
print(df.head())
   work_year experience_level employment_type                       job_title  \
0       2021               MI              FT                  Data Scientist   
1       2021               MI              FT                 BI Data Analyst   
2       2020               MI              FT                  Data Scientist   
3       2021               MI              FT                     ML Engineer   
4       2022               SE              FT  Lead Machine Learning Engineer   

     salary salary_currency  salary_in_usd employee_residence  remote_ratio  \
0  30400000             CLP          40038                 CL           100   
1  11000000             HUF          36259                 HU            50   
2  11000000             HUF          35735                 HU            50   
3   8500000             JPY          77364                 JP            50   
4   7500000             INR          95386                 IN            50   

  company_location company_size  
0               CL            L  
1               US            L  
2               HU            L  
3               JP            S  
4               IN            L  
In [5]:
df
Out[5]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2021 MI FT Data Scientist 30400000 CLP 40038 CL 100 CL L
1 2021 MI FT BI Data Analyst 11000000 HUF 36259 HU 50 US L
2 2020 MI FT Data Scientist 11000000 HUF 35735 HU 50 HU L
3 2021 MI FT ML Engineer 8500000 JPY 77364 JP 50 JP S
4 2022 SE FT Lead Machine Learning Engineer 7500000 INR 95386 IN 50 IN L
... ... ... ... ... ... ... ... ... ... ... ...
14833 2022 MI FT Business Intelligence Developer 15000 USD 15000 GH 100 GH M
14834 2020 EX FT Staff Data Analyst 15000 USD 15000 NG 0 CA M
14835 2021 EN FT Machine Learning Developer 15000 USD 15000 TH 100 TH L
14836 2022 EN FT Data Analyst 15000 USD 15000 ID 0 ID L
14837 2020 EN PT ML Engineer 14000 EUR 15966 DE 100 DE S

14838 rows × 11 columns

In [6]:
summary_statistics= df.describe()
df.describe()
Out[6]:
work_year salary salary_in_usd remote_ratio
count 14838.000000 1.483800e+04 14838.000000 14838.000000
mean 2023.138900 1.650227e+05 149874.718763 32.760480
std 0.700799 3.562354e+05 69009.181349 46.488278
min 2020.000000 1.400000e+04 15000.000000 0.000000
25% 2023.000000 1.021000e+05 102000.000000 0.000000
50% 2023.000000 1.422000e+05 141300.000000 0.000000
75% 2024.000000 1.875000e+05 185900.000000 100.000000
max 2024.000000 3.040000e+07 800000.000000 100.000000
In [7]:
data_types_missing= df.info()
#there are no null values in the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           14838 non-null  int64 
 1   experience_level    14838 non-null  object
 2   employment_type     14838 non-null  object
 3   job_title           14838 non-null  object
 4   salary              14838 non-null  int64 
 5   salary_currency     14838 non-null  object
 6   salary_in_usd       14838 non-null  int64 
 7   employee_residence  14838 non-null  object
 8   remote_ratio        14838 non-null  int64 
 9   company_location    14838 non-null  object
 10  company_size        14838 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.2+ MB
In [8]:
print(df.nunique())
work_year                5
experience_level         4
employment_type          4
job_title              153
salary                2363
salary_currency         23
salary_in_usd         2730
employee_residence      88
remote_ratio             3
company_location        77
company_size             3
dtype: int64
In [9]:
#transformation of the codes of the categoricl variables
df['experience_level'] = df['experience_level'].replace({'SE': 'Expert', 'MI': 'Intermediate', 'EN': 'Junior', 'EX': 'Director'})

df['employment_type'] = df['employment_type'].replace({'FT': 'Full-time', 'CT': 'Contract', 'FL': 'Freelance', 'PT': 'Part-time'})

def country_name(country_code):
    try:
        return pycountry.countries.get(alpha_2=country_code).name
    except:
        return 'other'
    
df['company_location'] = df['company_location'].apply(country_name)
df['employee_residence'] = df['employee_residence'].apply(country_name)
In [10]:
# Categorical variables

for column in ['work_year','experience_level','employment_type','company_size','remote_ratio','job_title','company_location']:
    print(df[column].unique())
[2021 2020 2022 2023 2024]
['Intermediate' 'Expert' 'Junior' 'Director']
['Full-time' 'Freelance' 'Contract' 'Part-time']
['L' 'S' 'M']
[100  50   0]
['Data Scientist' 'BI Data Analyst' 'ML Engineer'
 'Lead Machine Learning Engineer' 'Data Science Manager'
 'Head of Machine Learning' 'Research Engineer' 'Head of Data Science'
 'AI Programmer' 'Machine Learning Engineer' 'Lead Data Scientist'
 'Data Engineer' 'Applied Machine Learning Scientist' 'Lead Data Analyst'
 'Data Analytics Manager' 'Data Integration Specialist'
 'Principal Data Architect' 'NLP Engineer' 'Big Data Engineer'
 'AI Research Engineer' 'Machine Learning Software Engineer'
 'Data Analyst' 'Applied Data Scientist' 'AI Scientist'
 'Data Analytics Lead' 'Business Data Analyst' 'Product Data Analyst'
 'Computer Vision Engineer' 'Data Science Consultant' 'AI Architect'
 'Analytics Engineer' 'Machine Learning Scientist' 'Research Scientist'
 'Prompt Engineer' 'Principal Data Scientist' 'Applied Scientist'
 'Deep Learning Engineer' 'Data Architect' 'AI Engineer'
 'Data Infrastructure Engineer' 'Data Science' 'Director of Data Science'
 'Data Science Tech Lead' 'BI Analyst' 'Data Lead' 'Head of Data'
 'Analytics Engineering Manager' 'Data Product Owner'
 'Business Intelligence' 'Machine Learning Infrastructure Engineer'
 'Research Analyst' 'Managing Director Data Science'
 'Finance Data Analyst' 'Data Operations Engineer'
 'Robotics Software Engineer' 'MLOps Engineer' 'Lead Data Engineer'
 'AI Developer' 'Data Science Lead' 'Data Science Engineer'
 'Business Intelligence Manager' 'Business Intelligence Specialist'
 'Business Intelligence Engineer' 'Data Modeler' 'AWS Data Architect'
 'Machine Learning Researcher' 'Data Product Manager'
 'Data Strategy Manager' 'ETL Developer' 'Cloud Data Architect'
 'Computational Biologist' 'AI Software Engineer' 'Data Strategist'
 'Data Manager' 'Business Intelligence Analyst'
 'Applied Machine Learning Engineer' 'Data Specialist'
 'AI Product Manager' 'Data Science Director' 'Software Data Engineer'
 'Data Operations Specialist' 'BI Developer' 'Machine Learning Modeler'
 'Decision Scientist' 'Data Management Specialist' 'Data Analyst Lead'
 'Marketing Data Analyst' 'Machine Learning Manager'
 'Principal Data Engineer' 'Data Science Practitioner' 'ML Ops Engineer'
 'Principal Machine Learning Engineer' 'Data Quality Engineer'
 'Cloud Database Engineer' 'Staff Machine Learning Engineer'
 'Data Visualization Specialist' 'Data Scientist Lead' 'Robotics Engineer'
 'Data Integration Developer' 'Data Developer'
 'Machine Learning Developer' 'Staff Data Analyst'
 'Business Intelligence Developer' 'Business Intelligence Lead'
 'Data Pipeline Engineer' 'Data Quality Analyst'
 'Data Visualization Engineer' 'Principal Data Analyst'
 'Staff Data Scientist' 'Data Integration Engineer'
 'Machine Learning Operations Engineer' 'Consultant Data Engineer'
 'Cloud Data Engineer' 'Data Analytics Specialist'
 'Data Operations Analyst' 'Data Science Analyst' 'AI Research Scientist'
 'ETL Engineer' 'Computer Vision Software Engineer'
 'Data Management Analyst' 'Data Operations Manager'
 'Data Analytics Engineer' 'Data Visualization Analyst'
 'Financial Data Analyst' 'Manager Data Management' 'Big Data Architect'
 'Data Analytics Consultant' 'Autonomous Vehicle Technician'
 'Big Data Developer' 'Deep Learning Researcher'
 'Machine Learning Research Engineer' 'Data Management Consultant'
 'Azure Data Engineer' 'Business Intelligence Data Analyst'
 'Data Analytics Associate' 'Data Reporting Analyst'
 'Marketing Data Scientist' 'Data Modeller' 'Marketing Data Engineer'
 'Data DevOps Engineer' 'Data Operations Associate'
 'Machine Learning Specialist' 'Encounter Data Management Professional'
 'Applied Research Scientist' 'Admin & Data Analyst' 'Power BI Developer'
 'Compliance Data Analyst' 'Sales Data Analyst' 'BI Data Engineer'
 'Data Quality Manager' 'Quantitative Research Analyst' 'Insight Analyst'
 'CRM Data Analyst']
['other']
In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

# Extract the "job title" column
job_titles = df['job_title']

# Calculate the frequency of each job title
title_counts = job_titles.value_counts()

# Extract the top 20 most frequent job titles
top_20_titles = title_counts.head(20)

# Create a DataFrame for the top 20 titles
top_20_df = pd.DataFrame({'Job Title': top_20_titles.index, 'Count': top_20_titles.values})
In [12]:
# Plotting the count plot
plt.figure(figsize=(12, 6))
sns.set(style="darkgrid")
ax = sns.barplot(data=top_20_df, x='Count', y='Job Title', palette='cubehelix')
plt.xlabel('Count')
plt.ylabel('Job Titles')
plt.title('Top 20 Most Frequent Job Titles')

# Add count labels to the bars
for i, v in enumerate(top_20_df['Count']):
    ax.text(v + 0.2, i, str(v), color='black', va='center')

plt.tight_layout()
plt.show()
In [13]:
#calculate the number of individuals in each experience level
level_counts = df['experience_level'].value_counts()

# Create a pie chart
plt.figure(figsize=(7,12),dpi=80)
plt.pie(level_counts.values, labels=level_counts.index, autopct='%1.1f%%')
plt.title('Experience Level Distribution')

plt.show()
In [14]:
# Create a cross-tabulation of the two columns
cross_tab = pd.crosstab(df['experience_level'], df['company_size'])

# Create a heatmap using the cross-tabulation data
plt.figure(figsize=(10, 8))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap='Reds')

plt.xlabel('Company Size')
plt.ylabel('Experience Level')
plt.title('Relationship between Experience Level and Company Size')

plt.show()
In [15]:
import matplotlib.pyplot as plt
from matplotlib import ticker
# Create bar chart
average_salary = df.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)
top_ten_salaries = average_salary.head(10)

plt.figure(figsize=(15,10),dpi=80)
plt.bar(top_ten_salaries.index, top_ten_salaries)

# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('Salary $')
plt.title('Average of the ten highest salaries by Job Titles')
plt.xticks(rotation=35, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

plt.show()
In [16]:
import plotly.express as px
salary_by_country = df.groupby('company_location', as_index=False)['salary_in_usd'].mean()

fig = px.choropleth(salary_by_country,locations='company_location',locationmode='country names',color='salary_in_usd',
                    projection='equirectangular',hover_name='company_location',
                    labels={'salary_in_usd':'Average Salary in USD'},title='Distribution of average salary by company location')


fig.show("notebook")
In [17]:
common_jobs = ['Data Engineer', 'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Analytics Engineer','Research Scientist', 'Data Science Manager', 'Applied Scientist']
common_jobs = df[df['job_title'].isin(common_jobs)]
In [18]:
salary_common_jobs = common_jobs.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)
remote_common_jobs = common_jobs.groupby('job_title')['remote_ratio'].mean().sort_values(ascending=False)
salary_common_country = common_jobs.groupby('company_location')['salary_in_usd'].mean().sort_values(ascending=False)
In [19]:
# Create bar chart
salary_common_jobs = common_jobs.groupby('job_title')['salary_in_usd'].mean().sort_values(ascending=False)

plt.figure(figsize=(15,10),dpi=80)
plt.bar(salary_common_jobs.index, salary_common_jobs)

# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('Salary $')
plt.title('Average salary for common Job Titles')
plt.xticks(rotation=20, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

plt.show()
In [20]:
# Create bar chart
remote_common_jobs = common_jobs.groupby('job_title')['remote_ratio'].mean().sort_values(ascending=False)

plt.figure(figsize=(15,10),dpi=80)
plt.bar(remote_common_jobs.index, remote_common_jobs)

# Add labels to the chart
plt.xlabel('Job')
plt.ylabel('% remote')
plt.title('Remote rate by Job Titles')
plt.xticks(rotation=20, ha='right')
plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))

plt.show()
In [21]:
# Distribution of experience_level
print(df['experience_level'].value_counts())

# Distribution of employment_type
print(df['employment_type'].value_counts())

# Distribution of company_size
print(df['company_size'].value_counts())
Expert          9696
Intermediate    3553
Junior          1148
Director         441
Name: experience_level, dtype: int64
Full-time    14772
Part-time       27
Contract        26
Freelance       13
Name: employment_type, dtype: int64
M    13674
L      983
S      181
Name: company_size, dtype: int64
In [22]:
# Average salary by experience level
print(df.groupby('experience_level')['salary_in_usd'].mean().sort_values(ascending=False))
experience_level
Director        194730.210884
Expert          163700.967100
Intermediate    125386.553054
Junior           91656.841463
Name: salary_in_usd, dtype: float64
In [23]:
pip install matplotlib seaborn
Requirement already satisfied: matplotlib in c:\users\shrey\anaconda3\lib\site-packages (3.5.1)
Requirement already satisfied: seaborn in c:\users\shrey\anaconda3\lib\site-packages (0.11.2)
Requirement already satisfied: pillow>=6.2.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (9.0.1)
Requirement already satisfied: pyparsing>=2.2.1 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (3.0.4)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: packaging>=20.0 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (21.3)
Requirement already satisfied: cycler>=0.10 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: numpy>=1.17 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (1.21.5)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\shrey\anaconda3\lib\site-packages (from matplotlib) (1.3.2)
Requirement already satisfied: pandas>=0.23 in c:\users\shrey\anaconda3\lib\site-packages (from seaborn) (1.4.2)
Requirement already satisfied: scipy>=1.0 in c:\users\shrey\anaconda3\lib\site-packages (from seaborn) (1.7.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\shrey\anaconda3\lib\site-packages (from pandas>=0.23->seaborn) (2021.3)
Requirement already satisfied: six>=1.5 in c:\users\shrey\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [25]:
# Plotting average salary by experience level
plt.figure(figsize=(10, 6))
sns.barplot(x=df['experience_level'], y=df['salary_in_usd'], estimator=np.mean)
plt.xlabel('Experience Level')
plt.ylabel('Average Salary in USD')
plt.title('Average Salary by Experience Level')
plt.show()
In [26]:
# Correlation matrix
correlation_matrix = df.corr()

# Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()
In [27]:
# Bar chart for average salary by experience level
plt.figure(figsize=(10, 6))
sns.barplot(x='experience_level', y='salary_in_usd', data=df, estimator=np.mean, ci=None)
plt.title('Average Salary by Experience Level')
plt.xlabel('Experience Level')
plt.ylabel('Average Salary in USD')
plt.show()
In [ ]: